Feature selection

1. Using Univariate Statistical Tests


In [21]:
# Feature Extraction with Univariate Statistical Tests (Chi-squared for classification) 
import pandas as pd
#from pandas import read_csv 
from numpy import set_printoptions 
from sklearn.feature_selection import SelectKBest 
from sklearn.feature_selection import chi2

In [51]:
# load data 
filename = 'pima-indians-diabetes.data.csv' 
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'] 
dataframe = pd.read_csv(filename, names=names) 
#print(dataframe.columns)

In [23]:
array = dataframe.values 
X = array[:,0:8] 
Y = array[:,8]

In [24]:
# feature selection (4 features) 
kbest = SelectKBest(score_func=chi2, k=4)
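
The chi-squared score function assumes non-negative feature values, which all of the Pima columns satisfy. A minimal sanity check, not part of the original notebook:

# chi2 expects non-negative inputs; illustrative check before fitting
assert (X >= 0).all(), "chi2 expects non-negative feature values"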

In [26]:
fit = kbest.fit(X, Y)

In [64]:
# Get indices of most important features
# This must be called after 'fit' method
idxs_selected = kbest.get_support(indices=True)
print(idxs_selected)

# collect and print the names of the columns represented by the indices
colnms = dataframe.columns
new_features = [colnms[colidx] for colidx in idxs_selected]
for colnm in new_features:
    print(colnm)


[1 4 5 7]
plas
test
mass
age

In [28]:
# summarize scores 
set_printoptions(precision=3) 
print(fit.scores_)


[  111.52   1411.887    17.605    53.108  2175.565   127.669     5.393
   181.304]
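
The raw scores are easier to read when paired with their column names. A minimal sketch, assuming the cells above have been run:

# pair each chi-squared score with its column name, highest first (illustrative)
scores = pd.Series(fit.scores_, index=names[0:8]).sort_values(ascending=False)
print(scores)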

In [29]:
fit_features = fit.transform(X)

In [50]:
dataframe_new = pd.DataFrame(fit_features, columns=new_features)        
dataframe_new.head(5)


Out[50]:
plas test mass age
0 148.0 0.0 33.6 50.0
1 85.0 0.0 26.6 31.0
2 183.0 0.0 23.3 32.0
3 89.0 94.0 28.1 21.0
4 137.0 168.0 43.1 33.0

In [31]:
# summarize selected features 
print(fit_features[0:5,:])


[[ 148.     0.    33.6   50. ]
 [  85.     0.    26.6   31. ]
 [ 183.     0.    23.3   32. ]
 [  89.    94.    28.1   21. ]
 [ 137.   168.    43.1   33. ]]
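
To keep the selection step from seeing the test data during evaluation, the selector can be wrapped in a Pipeline so it is refit on each training fold. A minimal sketch; the classifier and its settings are illustrative and not from the original notebook:

from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# selection is refit inside each cross-validation fold, so scores stay honest
pipe = Pipeline([
    ('select', SelectKBest(score_func=chi2, k=4)),
    ('clf', LogisticRegression(max_iter=1000)),
])
print(cross_val_score(pipe, X, Y, cv=5).mean())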

2. Using Recursive Feature Elimination (RFE)


In [1]:
# Feature Extraction with RFE
from pandas import read_csv
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

In [4]:
# load data
url = "pima-indians-diabetes.data.csv"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(url, names=names)
colnms = dataframe.columns
array = dataframe.values
X = array[:,0:8]
Y = array[:,8]

In [5]:
# feature extraction 
model = LogisticRegression(solver='liblinear')  # liblinear converges reliably on this small dataset
rfe = RFE(model, n_features_to_select=4)  # keep the top 4 features
fit = rfe.fit(X, Y)
print("Num Features: ", fit.n_features_)
print("Selected Features: ", fit.support_)
print("Column names: ", colnms)
print("Feature Ranking: ", fit.ranking_)


Num Features:  4
Selected Features:  [ True  True False False False  True  True False]
Column names:  Index(['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'], dtype='object')
Feature Ranking:  [1 1 2 4 5 1 1 3]
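
The boolean mask and ranking are easier to interpret when lined up with the column names. A minimal sketch, assuming the fit above:

# show each predictor with its RFE rank (1 means the feature was kept)
for name, rank, keep in zip(colnms[0:8], fit.ranking_, fit.support_):
    print(name, rank, '(selected)' if keep else '')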

3. Feature Importance with ExtraTreesClassifier


In [66]:
# Feature Importance with Extra Trees Classifier
from pandas import read_csv
from sklearn.ensemble import ExtraTreesClassifier

In [67]:
# load data
url = "pima-indians-diabetes.data.csv"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(url, names=names)
array = dataframe.values
X = array[:,0:8]
Y = array[:,8]

In [68]:
# feature extraction
model = ExtraTreesClassifier()
model.fit(X, Y)

# Larger scores indicate greater feature importance
print(model.feature_importances_)


[ 0.112  0.218  0.09   0.08   0.078  0.149  0.119  0.155]
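
As with the chi-squared scores, the importances are easier to read when paired with the column names. A minimal sketch, assuming the cells above have been run:

import pandas as pd

# pair each importance with its column name, highest first (illustrative)
importances = pd.Series(model.feature_importances_, index=names[0:8])
print(importances.sort_values(ascending=False))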